#   LibAut
#       onset regressions: simple (bivariate) leave-pair-out CV
#       LPO-CV for Figure 6
#   Juraj Medzihorsky
#   2021-08-10

library(mgcv)
library(parallel)
options(mc.cores=14) #  written for 16 thread ubuntu machine 
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 4.0.4 (2021-02-15)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 20.04.2 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.13.so
##
##  locale:
##   [1] LC_CTYPE=C.UTF-8           LC_NUMERIC=C
##   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C.UTF-8
##   [7] LC_PAPER=C.UTF-8           LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C
##  [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods
##  [8] base
##
##  other attached packages:
##  [1] mgcv_1.8-34    nlme_3.1-152   nvimcom_0.9-92
##
##  loaded via a namespace (and not attached):
##  [1] compiler_4.0.4  Matrix_1.3-2    tools_4.0.4     splines_4.0.4
##  [5] grid_4.0.4      lattice_0.20-41


#------------------------------------------------------------------------------
#   data load

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)

setwd(data_dir)
dat <- readRDS('onset_20210810.rds')
eps <- read.csv('eps.csv')
setwd(code_dir)
dat <- subset(dat, onset_eligible%in%1)

#------------------------------------------------------------------------------
#   data prep

tab_dem <- aggregate(eps$dem_ep, by=list(eps$year), FUN=mean, na.rm=T)
tab_aut <- aggregate(eps$aut_ep, by=list(eps$year), FUN=mean, na.rm=T)
names(tab_dem) <- c('year', 'share_dem_ep')
names(tab_aut) <- c('year', 'share_aut_ep')

tab <- merge(tab_dem, tab_aut, by='year')

tab_lag <- tab
tab_lag$year <- tab$year + 1
names(tab_lag)[2:3] <- paste0('lag1_', names(tab)[2:3])

tab_tab <- merge(tab, tab_lag)

dat <- merge(dat, tab_tab, by='year')
weird <- which(abs(dat$lag1_e_migdpgro)>1)

dat$lag1_e_migdpgro[weird] <- NA

dat$logpop <- log(dat$e_mipopula) 

dat$reg_fac <- as.factor(dat$e_regionpol_6C)

#------------------------------------------------------------------------------

#   formulas
fy0 <- 'libaut_start ~ 1'
#
dummies <- c('reg_fac', 'e_miinteco', 'e_miinterc')
#
conties <- 
    c('e_migdppcln', 'e_migdpgro', 'logpop', 'v2pepwrsoc', 'v2xeg_eqdr',
      'v2xnp_pres', 'v2csprtcpt', 'v2x_polyarchy', 'excl_region_edi', 'year', 
      'share_dem_ep', 'share_aut_ep')  
#
dummies <- c(dummies[1], paste0('lag1_', dummies[2:3]))
conties <- c(paste0('lag1_', conties[1:9]), conties[10],
             paste0('lag1_', conties[11:12]))
#
fd <- sapply(paste0(fy0, ' + ', dummies), as.formula)
fc <- sapply(paste0(fy0, ' + ', 's(', conties, ', bs="gp")'), as.formula)
fc[[9]] <- as.formula(paste0(fy0, ' + ', 's(', conties[9], ', bs="re")'))


#   add fitted (predicted) values and their ses to a model object
getfitted <- 
    function(x) 
    {
        d <- x$model
        f <- predict(x, newdata=d, se.fit=T, type='response')
        d$fit <- f$fit
        d$se.fit <- f$se.fit
        return(d)
    }


#   bernoulli-logistic GLM with discrete X j with left-out 0-1 pair i 
getoned <-
    function(i, j, dd=pd)
    {
        ii <- unlist(gp_d[[j]][i,])
        odat <- dd[[j]][-ii,]
        ndat <- dd[[j]][ ii,]
        m <- glm(fd[[j]], data=odat, family=binomial('logit'))
        p <- predict(m, newdata=ndat, type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==ndat$libaut_start)
        return(data.frame(auc=auc, cc=cc))
    }


#   bernoulli-logistic GAM with continuous X j with left-out 0-1 pair i 
getonec <-
    function(i, j, dd=pc)
    {
        ii <- unlist(gp_c[[j]][i,])
        odat <- dd[[j]][-ii,]
        ndat <- dd[[j]][ ii,]
        m <- gam(fc[[j]], data=odat, family=binomial('logit'))
        p <- predict(m, newdata=ndat, type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==ndat$libaut_start)
        return(data.frame(auc=auc, cc=cc))
    }

#------------------------------------------------------------------------------
#   prep

#   fit to the avialble subsets
ld <- mclapply(fd, glm, data=dat, family=binomial('logit'))
lc <- mclapply(fc, gam, data=dat, family=binomial('logit'))
#   extract fitted values
pd <- mclapply(ld, getfitted) 
pc <- mclapply(lc, getfitted) 
#   make grids of all possible 0-1 y subsets for each X
gp_d <- lapply(pd, function(x) expand.grid(y0=which(x$libaut_start%in%0), y1=which(x$libaut_start%in%1)))
gp_c <- lapply(pc, function(x) expand.grid(y0=which(x$libaut_start%in%0), y1=which(x$libaut_start%in%1)))
#   extract available subset sizes for each X
n_d <- sapply(gp_d, nrow)
n_c <- sapply(gp_c, nrow)

#   prepare lists for storing the output
out_d <- vector(length=length(fd), mode='list')
out_c <- vector(length=length(fc), mode='list')
names(out_d) <- dummies
names(out_c) <- conties

#------------------------------------------------------------------------------
#   execute

old_time <- Sys.time()
n_cv <- 1e3
set.seed(123)
for (k in 1:length(out_d)) {
    rr <- n_d[k]
    vec <- sample(1:rr, n_cv, replace=F)
    out_d[[k]] <- do.call(rbind, mclapply(vec, function(ii) getoned(ii, j=k)))
}

set.seed(321)
for (k in 1:length(out_c)) {
    rr <- n_c[k]
    vec <- sample(1:rr, n_cv, replace=F)
    out_c[[k]] <- do.call(rbind, mclapply(vec, function(ii) getonec(ii, j=k)))
}
new_time <- Sys.time()

#   3.2 hours
round(new_time - old_time, 1)

#------------------------------------------------------------------------------
#   save

setwd(data_dir)
save(list=c('out_d', 'out_c'), file='onset_regressions_simple_lpo_20210810.rdata')

#   SCRIPT END